{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6fdba594",
   "metadata": {},
   "source": [
    "# PrivateGPT Tutorial"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "05999c63",
   "metadata": {},
   "source": [
    "In this tutorial, we use EvaDB to load a collection of PDFs, index their paragraphs with sentence embeddings, and query them by semantic similarity in a PrivateGPT-like workflow."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "40100c9c",
   "metadata": {},
   "source": [
    "<table align=\"left\">\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/georgia-tech-db/eva/blob/master/tutorials/13-privategpt.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /> Run on Google Colab</a>\n",
    "  </td>\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://github.com/georgia-tech-db/eva/blob/master/tutorials/13-privategpt.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /> View source on GitHub</a>\n",
    "  </td>\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://github.com/georgia-tech-db/eva/raw/master/tutorials/13-privategpt.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /> Download notebook</a>\n",
    "  </td>\n",
    "</table><br><br>"
   ]
  },
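  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "d53ef889",
   "metadata": {},
   "source": [
    "### Connect to EvaDB\n",
    "\n",
    "Install EvaDB (with the document and notebook extras) and the Qdrant client, then open a connection and create a cursor for running queries."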
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b6b7f61d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install --quiet \"evadb[document,notebook]\"\n",
    "%pip install --quiet qdrant_client\n",
    "import evadb\n",
    "cursor = evadb.connect().cursor()\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "cfda671a",
   "metadata": {},
   "source": [
    "### Download PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1ee8b17b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File ‘layout-parser-paper.pdf’ already there; not retrieving.\n",
      "\n",
      "File ‘state_of_the_union.pdf’ already there; not retrieving.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget -nc \"https://www.dropbox.com/s/6jdcn33xizdtl9k/layout-parser-paper.pdf\"\n",
    "!wget -nc \"https://www.dropbox.com/s/4q3bvne3m2vsu5g/state_of_the_union.pdf\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "1d9c44d4",
   "metadata": {},
   "source": [
    "### Load PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "56913976",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Number of loaded PDF: 1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         0\n",
       "0  Number of loaded PDF: 1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cursor.query(\"DROP TABLE IF  EXISTS MyPDFs\").df()\n",
    "cursor.query(\"LOAD PDF 'layout-parser-paper.pdf' INTO MyPDFs\").df()\n",
    "cursor.query(\"LOAD PDF 'state_of_the_union.pdf' INTO MyPDFs\").df()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "972e2b21",
   "metadata": {},
   "source": [
    "### Retrieve Text from Loaded PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f2674110",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mypdfs._row_id</th>\n",
       "      <th>mypdfs.name</th>\n",
       "      <th>mypdfs.page</th>\n",
       "      <th>mypdfs.paragraph</th>\n",
       "      <th>mypdfs.data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>layout-parser-paper.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>LayoutParser: A Uniﬁed Toolkit for DeepLearnin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>layout-parser-paper.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>Zejiang Shen1(�), Ruochen Zhang2, Melissa Dell...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>layout-parser-paper.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1 Allen Institute for AIshannons@allenai.org2 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>layout-parser-paper.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>Abstract. Recent advances in document image an...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>layout-parser-paper.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>Keywords: Document Image Analysis · Deep Learn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>512</th>\n",
       "      <td>2</td>\n",
       "      <td>state_of_the_union.pdf</td>\n",
       "      <td>17</td>\n",
       "      <td>19</td>\n",
       "      <td>Now is our moment to meet and overcome the cha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>513</th>\n",
       "      <td>2</td>\n",
       "      <td>state_of_the_union.pdf</td>\n",
       "      <td>17</td>\n",
       "      <td>20</td>\n",
       "      <td>And we will, as one people.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>514</th>\n",
       "      <td>2</td>\n",
       "      <td>state_of_the_union.pdf</td>\n",
       "      <td>17</td>\n",
       "      <td>21</td>\n",
       "      <td>One America.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>515</th>\n",
       "      <td>2</td>\n",
       "      <td>state_of_the_union.pdf</td>\n",
       "      <td>17</td>\n",
       "      <td>22</td>\n",
       "      <td>The United States of America.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>516</th>\n",
       "      <td>2</td>\n",
       "      <td>state_of_the_union.pdf</td>\n",
       "      <td>17</td>\n",
       "      <td>23</td>\n",
       "      <td>May God bless you all. May God protect our tro...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>517 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     mypdfs._row_id              mypdfs.name  mypdfs.page  mypdfs.paragraph  \\\n",
       "0                 1  layout-parser-paper.pdf            1                 1   \n",
       "1                 1  layout-parser-paper.pdf            1                 2   \n",
       "2                 1  layout-parser-paper.pdf            1                 3   \n",
       "3                 1  layout-parser-paper.pdf            1                 4   \n",
       "4                 1  layout-parser-paper.pdf            1                 5   \n",
       "..              ...                      ...          ...               ...   \n",
       "512               2   state_of_the_union.pdf           17                19   \n",
       "513               2   state_of_the_union.pdf           17                20   \n",
       "514               2   state_of_the_union.pdf           17                21   \n",
       "515               2   state_of_the_union.pdf           17                22   \n",
       "516               2   state_of_the_union.pdf           17                23   \n",
       "\n",
       "                                           mypdfs.data  \n",
       "0    LayoutParser: A Uniﬁed Toolkit for DeepLearnin...  \n",
       "1    Zejiang Shen1(�), Ruochen Zhang2, Melissa Dell...  \n",
       "2    1 Allen Institute for AIshannons@allenai.org2 ...  \n",
       "3    Abstract. Recent advances in document image an...  \n",
       "4    Keywords: Document Image Analysis · Deep Learn...  \n",
       "..                                                 ...  \n",
       "512  Now is our moment to meet and overcome the cha...  \n",
       "513                       And we will, as one people.   \n",
       "514                                      One America.   \n",
       "515                     The United States of America.   \n",
       "516  May God bless you all. May God protect our tro...  \n",
       "\n",
       "[517 rows x 5 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cursor.query(\"SELECT * FROM MyPDFs\").df()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "fe2f4b4a",
   "metadata": {},
   "source": [
    "### Create Sentence Transformer Feature Extractor UDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cf50b02f-eb7e-4aaf-be43-7a57d38149e5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File ‘sentence_feature_extractor.py’ already there; not retrieving.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Download the sentence feature extractor function implementation\n",
    "!wget -nc https://raw.githubusercontent.com/georgia-tech-db/eva/master/evadb/functions/sentence_feature_extractor.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a74e9461-42e0-4682-83e2-005a56f523d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "09-06-2023 16:25:21 WARNING[drop_object_executor:drop_object_executor.py:_handle_drop_function:0083] Function SentenceFeatureExtractor does not exist, therefore cannot be dropped.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Function SentenceFeatureExtractor successfully...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0\n",
       "0  Function SentenceFeatureExtractor successfully..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cursor.query(\"DROP FUNCTION IF EXISTS SentenceFeatureExtractor\").df()\n",
    "cursor.query(\"CREATE FUNCTION SentenceFeatureExtractor IMPL 'sentence_feature_extractor.py'\").df()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "1036139a",
   "metadata": {},
   "source": [
    "#### Configure DataFrame Display\n",
    "\n",
    "Disable column-width truncation so the full text of each paragraph is shown."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fd70a210",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
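  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f502eddb",
   "metadata": {},
   "source": [
    "### Create Vector Index for PDF Paragraphs\n",
    "\n",
    "Build a Qdrant-backed vector index over the sentence embedding of each paragraph, then rank the paragraphs by their similarity to a natural-language question."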
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d8b1abe7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Index qdrant_indexs successfully added to the database.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                         0\n",
       "0  Index qdrant_indexs successfully added to the database."
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cursor.query(\"\"\"\n",
    "    CREATE INDEX qdrant_indexs\n",
    "    ON MyPDFs (SentenceFeatureExtractor(data))\n",
    "    USING QDRANT\n",
    "\"\"\").df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "66b0bd42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mypdfs.data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>That’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>For that purpose we’ve mobilized American ground forces, air squadrons, and ship deployments to protect NATO countries including Poland, Romania, Latvia, Lithuania,and Estonia.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>As I have made crystal clear the United States and our Allies will defend every inch of territory of NATO countries with the full force of our collective power.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>He thought the West and NATO wouldn’t respond. And he thought he could divide us athome. Putin was wrong. We were ready.  Here is what we did.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>512</th>\n",
       "      <td>3.4Storage and visualization</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>513</th>\n",
       "      <td>Get rid of outdated rules that stop doctors from prescribing treatments. And stop the flow of illicit drugs by working with state and local law enforcement to go after traffickers.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>514</th>\n",
       "      <td>Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>515</th>\n",
       "      <td>But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>516</th>\n",
       "      <td>This is personal to me and Jill, to Kamala, and to so many of you.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>517 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                               mypdfs.data\n",
       "0                                                                                     That’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2. \n",
       "1        For that purpose we’ve mobilized American ground forces, air squadrons, and ship deployments to protect NATO countries including Poland, Romania, Latvia, Lithuania,and Estonia. \n",
       "2                                                 We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin. \n",
       "3                       As I have made crystal clear the United States and our Allies will defend every inch of territory of NATO countries with the full force of our collective power.  \n",
       "4                                        He thought the West and NATO wouldn’t respond. And he thought he could divide us athome. Putin was wrong. We were ready.  Here is what we did.   \n",
       "..                                                                                                                                                                                     ...\n",
       "512                                                                                                                                                           3.4Storage and visualization\n",
       "513  Get rid of outdated rules that stop doctors from prescribing treatments. And stop the flow of illicit drugs by working with state and local law enforcement to go after traffickers. \n",
       "514                                           Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter. \n",
       "515                                                                                                       But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n",
       "516                                                                                                                    This is personal to me and Jill, to Kamala, and to so many of you. \n",
       "\n",
       "[517 rows x 1 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "query = \"\"\"\n",
    "    SELECT data\n",
    "    FROM MyPDFs\n",
    "    ORDER BY Similarity(SentenceFeatureExtractor('When was the NATO created?'), SentenceFeatureExtractor(data))\n",
    "\"\"\"\n",
    "cursor.query(query).df()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "292337e8e9747092192f4a1ef18b0951099c869b0f06eb7241460e1768f24923"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
